# =============================================================================
# PYTHON FILE 1 --- COUNT RELEVANT WORDS
# =============================================================================
# =============================================================================
# PACKAGES
# =============================================================================
from __future__ import division

import os
import pandas as pd
import nltk
import itertools

from nltk.corpus import stopwords
from string import punctuation
from string import digits


# =============================================================================
# SETTINGS
# =============================================================================
# Project root that the relative data/ and output paths are resolved against.
# Leave empty to keep the current working directory: os.chdir('') would raise
# FileNotFoundError, so the directory is only changed when a path is given.
WORKING_DIR = r''
if WORKING_DIR:
    os.chdir(WORKING_DIR)


# Single keywords: every occurrence of one of these words in a listing is counted.
wordlist = ['storm', 'storms', 'superstorm', 'hurricane', 'hurricanes', 'fema', 'tornado', 'tornadoes', 'floodplain']


# Word pairs are defined explicitly below.
# This section makes a pair for all hurricanes with retired names since 2000 - these are the deadliest, costliest storms.
# `hurricane` holds the keyword itself followed by the retired names.
hurricane = ['hurricane']
hurricane = hurricane + ['keith','allison','iris','michelle','isidore','lili','fabian','isabel','juan','charley','frances','ivan','jeanne','dennis','katrina','rita','stan','wilma','dean','felix','noel','gustav','ike','paloma','igor','tomas','irene','sandy','ingrid','erika','joaquin','matthew','otto']

# Pair the keyword with each retired name, e.g. ('hurricane', 'katrina').
hurricane_names = [(hurricane[0], name) for name in hurricane[1:]]

# Each pair is listed exactly once so that a matching bigram is counted once.
pairlist = [('flood','risk'),('flood','insurance'),('flood','ins'),('flood','plain'),('flood','damage'),('flood','zone'),('flood','zones'),('flood','protection'),('flood','safe'),
            ('hurricane', 'zone'),('hurricane', 'zones'),('hurricane','shutter'),('hurricane','shutters'),('hurricane','shelter'),('hurricane','shelters'),('hurricane','protection'),('hurricane','safe'),('hurricane','impact'),('hurricane','curtains'),
            ('sea','level'),
            ('storm','zone'),('storm','zones'),('storm', 'window'),('storm', 'windows'),('storm','door'),('storm','doors'),('storm','water'),('storm','protection'),('storm','safe'),
            ('tornado','shutter'),('tornado','shutters'),('tornado','shelter'),('tornado','shelters')]

# extend (not append): every hurricane-name pair must be its own entry in
# pairlist. Appending the list would add one nested element that can never
# equal a word of a bigram, so no hurricane name would ever be matched.
pairlist.extend(hurricane_names)

# English stopwords, kept as a set for membership tests.
# The corpus has to be installed once with: nltk.download('stopwords')
remove_set = {*stopwords.words('english')}

# =============================================================================
# PREPROCESSING FUNCTION
# =============================================================================

def count_uni_bigrams(file_name, words=None, pairs=None, stop_words=None):
    """Count keyword and keyword-pair occurrences in each line of a listings file.

    Every line of ``file_name`` is treated as one listing. A listing is
    lowercased, each punctuation mark and digit is replaced by a space, the
    text is split on whitespace and stopwords are dropped. Each remaining
    token is checked against ``words``; each pair of adjacent tokens is
    checked against ``pairs`` in both orders.

    Parameters
    ----------
    file_name : str
        Path of a UTF-8 text file with one listing per line. Bytes that
        cannot be decoded are ignored.
    words : collection of str, optional
        Single keywords to count. Defaults to the module-level ``wordlist``.
    pairs : iterable of (str, str), optional
        Word pairs to count. Defaults to the module-level ``pairlist``.
        Entries are checked one by one, so an entry listed twice is
        counted twice.
    stop_words : collection of str, optional
        Tokens removed before counting. Defaults to the module-level
        ``remove_set``.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``listings_list`` (original
        text), ``word_counts``, ``pair_counts``, ``word_id`` and
        ``pair_id``. The two ``*_id`` columns hold the matches in order of
        appearance, each followed by ``'|'``.
    """
    # Fall back to the module-level settings when no override is given.
    if words is None:
        words = wordlist
    if pairs is None:
        pairs = pairlist
    if stop_words is None:
        stop_words = remove_set
    # pairs is walked once per bigram, so it must be re-iterable.
    pairs = list(pairs)

    # One pass replaces every punctuation mark and digit with a space.
    strip_table = str.maketrans({char: ' ' for char in punctuation + digits})

    # Splits listings by lines - a listing that contains a line break is
    # read as several separate listings.
    with open(file_name, encoding="utf-8", errors="ignore") as listings_file:
        listings_list = listings_file.read().splitlines()

    # Per-listing counters and the matched strings behind them.
    word_counts = []
    pair_counts = []
    word_id = []
    pair_id = []

    for listing in listings_list:
        cleaned = listing.lower().translate(strip_table)

        # Must be a real list: the tokens are traversed twice (words, then
        # bigrams). A lazy filter() object would be exhausted by the first
        # traversal and leave every pair count at zero.
        tokens = [token for token in cleaned.split() if token not in stop_words]

        # Single keywords.
        matched_words = [token for token in tokens if token in words]
        word_counts.append(len(matched_words))
        word_id.append(''.join(word + '|' for word in matched_words))

        # Adjacent token pairs, matched against each pair in either order.
        matched_pairs = []
        for bigram in zip(tokens, tokens[1:]):
            for pair in pairs:
                if bigram[0] == pair[0] and bigram[1] == pair[1]:
                    matched_pairs.append(pair[0] + ' ' + pair[1])
                if bigram[0] == pair[1] and bigram[1] == pair[0]:
                    matched_pairs.append(pair[1] + ' ' + pair[0])
        pair_counts.append(len(matched_pairs))
        pair_id.append(''.join(match + '|' for match in matched_pairs))

    # Original text alongside its counters; the column order is relied on by
    # callers that write the frame without a header.
    output = pd.DataFrame(
        {'listings_list': listings_list,
         'word_counts': word_counts,
         'pair_counts': pair_counts,
         'word_id': word_id,
         'pair_id': pair_id
         })

    return output

# =============================================================================
# PREPROCESSING
# =============================================================================
fs_file = "data/pseudo/FS_listings.txt"
fr_file = "data/pseudo/FR_listings.txt"

# Each listings file is counted and written to its own CSV; the CSVs carry
# neither a header row nor an index column.
for listings_path, csv_path in (
    (fs_file, 'replication output/FourState_FS_listings_word_pairs_count_flags.csv'),
    (fr_file, 'replication output/FourState_FR_listings_word_pairs_count_flags.csv'),
):
    output = count_uni_bigrams(listings_path)
    output.to_csv(csv_path, header = False, index = False)










































